Let us read the file.

library(readr)
lyrics <- read_csv("songdata.csv")
Parsed with column specification:
cols(
  artist = col_character(),
  song = col_character(),
  link = col_character(),
  text = col_character()
)
head(lyrics)

Let us examine the dimensions of the lyrics data frame.

dim(lyrics)
[1] 57650     4
library(dplyr)
glimpse(lyrics)
Observations: 57,650
Variables: 4
$ artist <chr> "ABBA", "ABBA", "ABBA", "ABBA", "ABBA", "ABBA", "ABBA", "ABBA", "ABBA", "ABBA", "ABBA", "ABBA", "A...
$ song   <chr> "Ahe's My Kind Of Girl", "Andante, Andante", "As Good As New", "Bang", "Bang-A-Boomerang", "Burnin...
$ link   <chr> "/a/abba/ahes+my+kind+of+girl_20598417.html", "/a/abba/andante+andante_20002708.html", "/a/abba/as...
$ text   <chr> "Look at her face, it's a wonderful face  \nAnd it means something special to me  \nLook at the wa...

Analysis of 55000+ lyrics data - Number of artists - Which artist has highest and lowest number of songs - Distribution of songs of all artists in the dataset - Distribution of lyrics length - Which song lyrics has maximum number of words - Which song lyrics has minimum number of words - Distribution of words count in title - Which songs title has maximum number of words - Which songs title has minimum number of words - WordClouds of titles with minimum and maximum lengths - Is there a relation between title length and song length?

Let’s start with finding out how many artists are listed in the data. Also, how many songs each artist has.

artist<- as.data.frame(table(as.data.frame(lyrics$artist)))
colnames(artist) <- c("artist", "Num_of_songs")
head(artist)

Let’s see which artists have the most and the fewest songs in the dataset.

most_songs <- arrange(artist, desc(Num_of_songs))
most_songs

least_songs <- tail(most_songs, 15)
p2 <- ggplot(data = least_songs, aes(artist, Num_of_songs, fill = Num_of_songs)) +
      geom_bar(stat = "identity") +
      geom_text(aes(label=Num_of_songs), vjust=1.6, color="white", size=3) +
      ggtitle("Artists with least number of songs") +
      tilt_theme
p2

Let’s check the distribution of songs for all artists.

p3 <- ggplot(artist, aes(x=Num_of_songs)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="red")
p3

Let’s analyze the number of words in each song and its distribution.

library(stringr)
count_words <- function(vec){
  return (length(unlist((str_extract_all(tolower(vec), '\\w+')))))
}
lyrics$word_count <- sapply(lyrics$text, count_words)
head(lyrics$word_count)
[1] 161 272 322 257 255 115
p4 <- ggplot(lyrics, aes(x=word_count)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="red")
p4

Let’s analyze the title of the songs, their wordcount and their distribution

lyrics$title_word_count <- sapply(lyrics$song, count_words)
head(lyrics$title_word_count)
[1] 6 2 4 1 3 3

Let’s check out the songs that are longest and shortest.

longest_song <- arrange(lyrics, desc(word_count))
longest_song <- head(longest_song, 10)
shortest_song <- arrange(lyrics, word_count)
shortest_song <- head(shortest_song, 10)
longest_song
shortest_song
p5 <- ggplot(data = longest_song, aes(song, word_count, fill = title_word_count)) +
      geom_bar(stat = "identity") +
      geom_text(aes(label=title_word_count), vjust=1.6, color="white", size=3) +
      ggtitle("Longest Songs") +
      tilt_theme
p6 <- ggplot(data = shortest_song, aes(song, word_count, fill = title_word_count)) +
      geom_bar(position = "dodge", stat = "identity") +
      geom_text(aes(label = title_word_count), vjust = 1.6, color = "white", size = 3) +
      ggtitle("Shortest Songs") +
      tilt_theme
multiplot(p5, p6, cols=2)

p7 <- ggplot(lyrics, aes(x=title_word_count)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white", binwidth = 1, bins = 1)+
 geom_density(alpha=.2, fill="red")
p7

WordCloud of popular words from song titles

library(wordcloud)
library(SnowballC)
library(RColorBrewer)
library(tm)
texts <- lyrics$song
#texts <- iconv(texts, to = "utf-8")
corpus <- Corpus(VectorSource(texts))
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords('english'))
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, removeWords, c("and", "this", "there")) 
corpus <- Corpus(VectorSource(corpus))
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
d <- d[-which(d$word %in% c("and","this","that")),]
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

There are many song titles that are of length 1, 2 and 3. But surprisingly, there are titles of length more than 13 too. Let’s check them out.

longest_title <- subset(lyrics, lyrics$title_word_count > 13)
longest_title
shortest_title <- subset(lyrics, lyrics$title_word_count == 1)
shortest_title

There are 8 songs with title length more than 13 and 8342 songs with single word title. Let’s see word cloud of single word titles and longest titles

texts <- longest_title$song
corpus <- Corpus(VectorSource(texts))
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- Corpus(VectorSource(corpus))
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,scale=c(2,0.5),
          max.words=100, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

texts <- shortest_title$song
corpus <- Corpus(VectorSource(texts))
corpus <- tm_map(corpus, PlainTextDocument)
corpus <- Corpus(VectorSource(corpus))
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,scale=c(2,0.5),
          max.words=100, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

An interesting question: is there a relation between the length of a title and the length of its song? Most probably not, but let’s check.

p8 <- ggplot(lyrics, aes(x=factor(title_word_count), y=word_count, fill = factor(title_word_count))) + 
  geom_boxplot() 
p8 

cor(lyrics$title_word_count, lyrics$word_count)
[1] -0.02509779

As expected, there is no correlation between these two quantities.

Let us fix the contracted words to their full forms first.

# function to expand contractions in an English-language source
fix.contractions <- function(doc) {
  # "won't" is a special case as it does not expand to "wo not"
  doc <- gsub("won't", "will not", doc)
  doc <- gsub("can't", "can not", doc)
  doc <- gsub("n't", " not", doc)
  doc <- gsub("'ll", " will", doc)
  doc <- gsub("'re", " are", doc)
  doc <- gsub("'ve", " have", doc)
  doc <- gsub("'m", " am", doc)
  doc <- gsub("'d", " would", doc)
  # 's could be 'is' or could be possessive: it has no expansion
  doc <- gsub("'s", "", doc)
  return(doc)
}
# fix (expand) contractions
lyrics$text <- sapply(lyrics$text, fix.contractions)

Remove special characters from lyrics

# function to remove special characters
removeSpecialChars <- function(x) gsub("[^a-zA-Z0-9 ]", " ", x)
# remove special characters
lyrics$text <- sapply(lyrics$text, removeSpecialChars)

Convert all lyrics text to lower case

# convert everything to lower case
lyrics$text <- sapply(lyrics$text, tolower)

Let’s check the structure of one lyrics to see the changes.

str(lyrics[13, ]$text, nchar.max = 300)
 chr "changing  moving in a circle   i can see your face in all of my dreams   smiling  laughing from the shadows   when i hear your voice  i know what it means   i know it does not matter just how hard i try   you are all the reason for my life      disillusion  disillusion all you left "| __truncated__

SENTIMENT ANALYSIS OF LYRICS Let us perform sentiment analysis on the lyrics. There are various types of sentiment lexicons that can be used. Let us have a look at them.

library(tidytext)
library(tidyr)
get_sentiments("afinn")
get_sentiments("bing")
get_sentiments("nrc")
get_sentiments("loughran")

nrc seems to have a larger number of words and sentiment labels than the other lexicons.

nrc_sentiment <- get_sentiments("nrc")
unique(nrc_sentiment$sentiment)
 [1] "trust"        "fear"         "negative"     "sadness"      "anger"        "surprise"     "positive"    
 [8] "disgust"      "joy"          "anticipation"

Let us find the sentiments of each lyrics based on each NRC sentiments.

lyrics_words <- select(lyrics, c("artist", "text"))
lyrics_words <- lyrics_words %>% unnest_tokens(word, text)
head(lyrics_words)

Let’s see words that depict “joy”

joy <- lyrics_words %>%
  inner_join(get_sentiments("nrc") %>% 
  filter(sentiment == "joy")) 
Joining, by = "word"
joy <- as.data.frame(sort(table(joy$word)))
columns_sentiment <- c("word", "Freq")
colnames(joy) <- columns_sentiment
tail(joy, 10)

Similarly, let’s see the words that represent the other 9 sentiments.

trust <- lyrics_words %>%
  inner_join(get_sentiments("nrc") %>% 
  filter(sentiment == "trust")) 
Joining, by = "word"
trust <- as.data.frame(sort(table(trust$word)))
colnames(trust) <- columns_sentiment
tail(trust, 10)
fear <- lyrics_words %>%
  inner_join(get_sentiments("nrc") %>% 
  filter(sentiment == "fear")) 
Joining, by = "word"
fear <- as.data.frame(sort(table(fear$word)))
colnames(fear) <- columns_sentiment
sadness <- lyrics_words %>%
  inner_join(get_sentiments("nrc") %>% 
  filter(sentiment == "sadness")) 
Joining, by = "word"
sadness <- as.data.frame(sort(table(sadness$word)))
colnames(sadness) <- columns_sentiment
anger <- lyrics_words %>%
  inner_join(get_sentiments("nrc") %>% 
  filter(sentiment == "anger")) 
Joining, by = "word"
anger <- as.data.frame(sort(table(anger$word)))
colnames(anger) <- columns_sentiment
surprise <- lyrics_words %>%
  inner_join(get_sentiments("nrc") %>% 
  filter(sentiment == "surprise")) 
Joining, by = "word"
surprise <- as.data.frame(sort(table(surprise$word)))
colnames(surprise) <- columns_sentiment
disgust <- lyrics_words %>%
  inner_join(get_sentiments("nrc") %>% 
  filter(sentiment == "disgust")) 
Joining, by = "word"
disgust <- as.data.frame(sort(table(disgust$word)))
colnames(disgust) <- columns_sentiment
anticipation <- lyrics_words %>%
  inner_join(get_sentiments("nrc") %>% 
  filter(sentiment == "anticipation")) 
Joining, by = "word"
anticipation <- as.data.frame(sort(table(anticipation$word)))
colnames(anticipation) <- columns_sentiment

Let’s plot the word occurrences for each sentiment (except positive and negative)

Let us check top words that depicts “positive” and “negative” sentiments in NRC Sentiment category.

Joining, by = "word"
Joining, by = "word"

Let us check top words that depicts “positive” and “negative” sentiments in bing Sentiment category.

Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"

It can be seen that positive and negative words are different for all three lexicons.

---
title: "Lyrics Analysis"
output: html_notebook
---

Let us read the file.

```{r}
# Read the lyrics CSV into `lyrics` and preview its first six rows.
library(readr)
lyrics <- read_csv(file = "songdata.csv")
head(lyrics, n = 6)
```

Let us examine the dimensions of the lyrics data frame.

```{r}
# Number of rows (songs) and columns in the data.
c(nrow(lyrics), ncol(lyrics))
```

```{r}
# Compact column-by-column overview (name, type, first values).
library(dplyr)
lyrics %>% glimpse()
```

Analysis of 55000+ lyrics data
- Number of artists
- Which artist has highest and lowest number of songs 
- Distribution of songs of all artists in the dataset
- Distribution of lyrics length
- Which song lyrics has maximum number of words
- Which song lyrics has minimum number of words
- Distribution of words count in title
- Which songs title has maximum number of words 
- Which songs title has minimum number of words
- WordClouds of titles with minimum and maximum lengths
- Is there a relation between title length and song length?


- Sentiments of the songs (NRC, Bing)
- Which words occur most often in the lyrics of the songs
- Is there a correlation between the words in the songs of same artists?
- Wordcloud of most popular words in the songs
- Top words used by an artist in his/her songs
- Are there common rhythmic words that repeat again and again?


Let's start with finding out how many artists are listed in the data. Also, how many songs each artist has.

```{r}
# One row per distinct artist with the number of songs listed for them.
# table() tabulates the artist column directly; the result is converted to a
# two-column data frame and given descriptive column names.
artist <- setNames(
  as.data.frame(table(lyrics$artist)),
  c("artist", "Num_of_songs")
)
head(artist)
```

Let's see which artists have the most and the fewest songs in the dataset.

```{r}
# Artists ranked from most to fewest songs.
most_songs <- artist %>%
  arrange(desc(Num_of_songs))
most_songs
```

```{r fig.width=5, fig.height=3, echo=FALSE}
library(ggplot2)
# Only multiplot() is needed from Rmisc. Attaching Rmisc would also attach
# plyr, and plyr loaded after dplyr masks dplyr verbs (arrange(), desc(),
# mutate(), summarise(), ...). Binding the single function keeps `multiplot`
# available to later chunks without changing which arrange()/desc() they get.
multiplot <- Rmisc::multiplot
# Shared theme element: tilt x-axis labels 45 degrees so long names stay legible.
tilt_theme <- theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Bar chart of the ten artists with the most songs; most_songs is already
# sorted in descending order, so head() picks the top ten.
p1 <- ggplot(data = head(most_songs, 10), aes(artist, Num_of_songs, fill = Num_of_songs)) +
      geom_bar(stat = "identity") +
      geom_text(aes(label = Num_of_songs), vjust = 1.6, color = "white", size = 3) +
      ggtitle("Artists with most number of songs") +
      tilt_theme
p1
```

```{r}
# Bar chart of the 15 artists at the bottom of the descending song-count ranking.
least_songs <- tail(most_songs, 15)
p2 <- ggplot(least_songs, aes(x = artist, y = Num_of_songs, fill = Num_of_songs)) +
  geom_col() +
  geom_text(aes(label = Num_of_songs), size = 3, color = "white", vjust = 1.6) +
  ggtitle("Artists with least number of songs") +
  tilt_theme
p2
```

Let's check the distribution of songs for all artists.

```{r}
# Histogram (on the density scale) of songs per artist with a density overlay.
p3 <- ggplot(data = artist, mapping = aes(x = Num_of_songs)) +
  geom_histogram(mapping = aes(y = ..density..), fill = "white", colour = "black") +
  geom_density(fill = "red", alpha = 0.2)
p3
```

Let's analyze the number of words in each song and its distribution.

```{r}
library(stringr)
# Count the words in a character vector.
#
# Args:
#   vec: character vector; a "word" is a maximal run of word characters
#        (regex \w+), so "it's" counts as two words ("it" and "s").
# Returns:
#   A single integer: the total number of words across all elements of `vec`.
#   Missing (NA) and empty elements contribute 0 words, and an empty vector
#   yields 0.
count_words <- function(vec) {
  # str_count() is vectorised and returns NA for NA input; na.rm = TRUE makes a
  # missing text count as zero words instead of being tallied as one match.
  # Case folding is unnecessary because \w matches letters of either case.
  sum(stringr::str_count(vec, "\\w+"), na.rm = TRUE)
}
# count_words() totals over its whole input, so it is applied song by song.
# vapply() guarantees exactly one integer per lyric (sapply() gives no such
# guarantee on empty or ragged input), and USE.NAMES = FALSE stops every count
# from being named with the full text of its lyric.
lyrics$word_count <- vapply(lyrics$text, count_words, integer(1), USE.NAMES = FALSE)
head(lyrics$word_count)
```

```{r}
# Histogram (on the density scale) of words per song with a density overlay.
p4 <- ggplot(data = lyrics, mapping = aes(x = word_count)) +
  geom_histogram(mapping = aes(y = ..density..), fill = "white", colour = "black") +
  geom_density(fill = "red", alpha = 0.2)
p4
```

Let's analyze the title of the songs, their wordcount and their distribution

```{r}
# Words per song title. vapply() guarantees one integer per title, and
# USE.NAMES = FALSE avoids naming each count with the title string itself.
lyrics$title_word_count <- vapply(lyrics$song, count_words, integer(1), USE.NAMES = FALSE)
head(lyrics$title_word_count)
```

Let's check out the songs that are longest and shortest.

```{r}
# Ten songs with the most words and ten with the fewest.
longest_song <- lyrics %>%
  arrange(desc(word_count)) %>%
  head(10)
shortest_song <- lyrics %>%
  arrange(word_count) %>%
  head(10)
longest_song
shortest_song
```



```{r}
# Side-by-side bar charts of the ten longest and ten shortest lyrics. Bar
# height is the lyric word count; fill and label show the title word count.
p5 <- ggplot(longest_song, aes(x = song, y = word_count, fill = title_word_count)) +
  geom_col() +
  geom_text(aes(label = title_word_count), size = 3, color = "white", vjust = 1.6) +
  ggtitle("Longest Songs") +
  tilt_theme
p6 <- ggplot(shortest_song, aes(x = song, y = word_count, fill = title_word_count)) +
  geom_col(position = "dodge") +
  geom_text(aes(label = title_word_count), size = 3, color = "white", vjust = 1.6) +
  ggtitle("Shortest Songs") +
  tilt_theme
multiplot(p5, p6, cols = 2)
```




```{r}
# Distribution of title lengths (in words) with a density overlay.
# Title lengths are whole numbers, so each bin is one word wide. Only
# `binwidth` is given: it takes precedence over `bins`, so also passing
# `bins = 1` was contradictory and had no effect.
p7 <- ggplot(lyrics, aes(x = title_word_count)) +
  geom_histogram(aes(y = ..density..), colour = "black", fill = "white", binwidth = 1) +
  geom_density(alpha = .2, fill = "red")
p7
```

WordCloud of popular words from song titles

```{r}
library(wordcloud)
library(SnowballC)
library(RColorBrewer)
library(tm)
texts <- lyrics$song
#texts <- iconv(texts, to = "utf-8")
# Only corpus-wide term totals are needed, so all titles are pooled into one
# document. The term-document matrix then has a single column and stays small
# when as.matrix() turns it into a dense matrix.
corpus <- Corpus(VectorSource(paste(texts, collapse = " ")))
# Lower-case before removing stopwords: removeWords() is case-sensitive and
# stopwords("english") is all lower case, so capitalised title words such as
# "The" or "And" would otherwise survive. Every step here is a text-to-text
# transformer, so the corpus never has to be converted and rebuilt.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, removeWords, c("and", "this", "there"))
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
# Total occurrences of each term, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 10)
# Filter with a logical mask. With d[-which(cond), ], a condition that matches
# nothing gives -integer(0), which selects zero rows and empties the table.
d <- d[!d$word %in% c("and", "this", "that"), ]
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
```

There are many song titles that are of length 1, 2 and 3. But surprisingly, there are titles of length more than 13 too. Let's check them out.

```{r}
# Songs whose title has more than 13 words, and songs with a one-word title.
# subset() evaluates the condition inside `lyrics`, so the column is named directly.
longest_title <- subset(lyrics, title_word_count > 13)
longest_title
shortest_title <- subset(lyrics, title_word_count == 1)
shortest_title
```

There are 8 songs with title length more than 13 and 8342 songs with single word title. Let's see word cloud of single word titles and longest titles

```{r}
# Word cloud of the words used in the longest titles.
texts <- longest_title$song
# Pool the titles into one document: only overall term totals are needed, and a
# one-column term-document matrix stays small when as.matrix() densifies it.
# The corpus is built once from plain character data, so no conversion to
# PlainTextDocument or re-wrapping of the corpus is required.
corpus <- Corpus(VectorSource(paste(texts, collapse = " ")))
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
# Total occurrences of each term, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 10)
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1, scale = c(2, 0.5),
          max.words = 100, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
```

```{r}
# Word cloud of the single-word titles.
texts <- shortest_title$song
# Pool the titles into one document: only overall term totals are needed, and a
# one-column term-document matrix stays small when as.matrix() densifies it.
# The corpus is built once from plain character data, so no conversion to
# PlainTextDocument or re-wrapping of the corpus is required.
corpus <- Corpus(VectorSource(paste(texts, collapse = " ")))
dtm <- TermDocumentMatrix(corpus)
m <- as.matrix(dtm)
# Total occurrences of each term, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 10)
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1, scale = c(2, 0.5),
          max.words = 100, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
```

An interesting question: is there a relation between the length of a title and the length of its song? Most probably not, but let's check.

```{r}
# Box plot of lyric word counts, one box per title length (in words).
p8 <- ggplot(
  data = lyrics,
  mapping = aes(
    x = factor(title_word_count),
    y = word_count,
    fill = factor(title_word_count)
  )
) +
  geom_boxplot()
p8
```

```{r}
# Correlation between title length and lyric length (cor() defaults to Pearson).
with(lyrics, cor(title_word_count, word_count))
```
 As expected, there is no correlation between these two quantities.
 
 Let us fix the contracted words to their full forms first.

```{r}
# Expand common English contractions in a character vector.
#
# Args:
#   doc: character vector of text (one element per document).
# Returns:
#   A character vector of the same length with contractions expanded.
#
# Matching ignores case because this function is applied before the lyrics are
# lower-cased; with case-sensitive patterns "Won't"/"Can't" miss their special
# cases and the generic "n't" rule turns them into "Wo not"/"Ca not". Every
# pattern must also end at a word boundary, so elided words such as "'member"
# are not mistaken for a contraction suffix.
fix.contractions <- function(doc) {
  # Ordered pattern -> replacement rules; order matters because the irregular
  # forms must be rewritten before the generic "n't" rule sees them.
  rules <- c(
    # "won't"/"can't" do not expand to "wo not"/"ca not"; the captured first
    # letter keeps the original capitalisation ("Won't" -> "Will not").
    "\\b(w)on't\\b" = "\\1ill not",
    "\\b(c)an't\\b" = "\\1an not",
    "n't\\b" = " not",
    "'ll\\b" = " will",
    "'re\\b" = " are",
    "'ve\\b" = " have",
    "'m\\b" = " am",
    "'d\\b" = " would",
    # 's could be "is" or a possessive, so it is dropped rather than expanded
    "'s\\b" = ""
  )
  for (pattern in names(rules)) {
    doc <- gsub(pattern, rules[[pattern]], doc, ignore.case = TRUE, perl = TRUE)
  }
  doc
}

# fix (expand) contractions
# fix.contractions() is built on gsub(), which already works on whole vectors,
# so the column is passed in one call; sapply() would invoke it once per song
# and name every result with the full original lyric.
lyrics$text <- fix.contractions(lyrics$text)
```
 
 Remove special characters from lyrics
```{r}
# Replace every character that is not an ASCII letter, a digit or a space with
# a single space. Works element-wise on a character vector.
removeSpecialChars <- function(x) {
  gsub(pattern = "[^a-zA-Z0-9 ]", replacement = " ", x = x)
}
# remove special characters
# removeSpecialChars() wraps gsub(), which is vectorised, so the column is
# cleaned in one call instead of once per song via sapply() (which would also
# name each result with the full lyric text).
lyrics$text <- removeSpecialChars(lyrics$text)
```

Convert all lyrics text to lower case
```{r}
# convert everything to lower case
# tolower() is vectorised; calling it directly avoids sapply()'s per-element
# overhead and the names it attaches to a character result.
lyrics$text <- tolower(lyrics$text)
```

Let's check the structure of one lyrics to see the changes.
```{r}
# Show the cleaned text of the 13th song, truncated to 300 characters.
str(lyrics$text[13], nchar.max = 300)
```
 
 SENTIMENT ANALYSIS OF LYRICS
 Let us perform sentiment analysis on the lyrics. There are various types of sentiment lexicons that can be used. Let us have a look at them.
```{r}
# Preview each of the four sentiment lexicons used for comparison.
library(tidytext)
library(tidyr)
get_sentiments(lexicon = "afinn")
get_sentiments(lexicon = "bing")
get_sentiments(lexicon = "nrc")
get_sentiments(lexicon = "loughran")
```

nrc seems to have a larger number of words and sentiment labels than the other lexicons.

```{r}
# Keep the NRC lexicon and list the distinct sentiment labels it contains.
nrc_sentiment <- get_sentiments(lexicon = "nrc")
nrc_sentiment %>%
  pull(sentiment) %>%
  unique()
```

Let us find the sentiments of each lyrics based on each NRC sentiments.

```{r}
# Tokenise the lyrics: one row per (artist, word) occurrence.
lyrics_words <- lyrics %>%
  select(artist, text) %>%
  unnest_tokens(output = word, input = text)
head(lyrics_words)
dim(lyrics_words)
```

Let's see words that depict "joy"
```{r}
# Lyric words tagged "joy" in the NRC lexicon. The join key is stated
# explicitly so the match does not depend on whichever column names the two
# tables happen to share (and no "Joining, by = ..." message is printed).
joy <- lyrics_words %>%
  inner_join(
    get_sentiments("nrc") %>% filter(sentiment == "joy"),
    by = "word"
  )
# Frequency of each word, sorted ascending so tail() returns the most frequent.
joy <- as.data.frame(sort(table(joy$word)))
# Column names shared by all per-sentiment frequency tables.
columns_sentiment <- c("word", "Freq")
colnames(joy) <- columns_sentiment
tail(joy, 10)


```

Similarly, let's see the words that represent the other 9 sentiments.

```{r}
# Lyric words tagged "trust" in the NRC lexicon, joined on an explicit key so
# the match does not depend on implicitly shared column names.
trust <- lyrics_words %>%
  inner_join(
    get_sentiments("nrc") %>% filter(sentiment == "trust"),
    by = "word"
  )
# Frequency of each word, sorted ascending so tail() returns the most frequent.
trust <- as.data.frame(sort(table(trust$word)))
colnames(trust) <- columns_sentiment
tail(trust, 10)
```

```{r}
# Build the word-frequency table for one NRC sentiment.
#
# Args:
#   emotion: a single NRC sentiment label, e.g. "fear".
# Returns:
#   A data frame with columns named by `columns_sentiment` (word, Freq), sorted
#   by ascending frequency so tail() gives the most frequent words.
# Reads `lyrics_words` and `columns_sentiment` from the notebook environment.
nrc_word_freq <- function(emotion) {
  tagged <- get_sentiments("nrc") %>%
    filter(sentiment == emotion)
  # Explicit key: the match must not depend on implicitly shared column names.
  matched <- inner_join(lyrics_words, tagged, by = "word")
  freq <- as.data.frame(sort(table(matched$word)))
  colnames(freq) <- columns_sentiment
  freq
}

fear <- nrc_word_freq("fear")
sadness <- nrc_word_freq("sadness")
anger <- nrc_word_freq("anger")
surprise <- nrc_word_freq("surprise")
disgust <- nrc_word_freq("disgust")
anticipation <- nrc_word_freq("anticipation")
```

Let's plot the word occurrences for each sentiment (except positive and negative)

```{r fig.width=8, fig.height=4, echo=FALSE}
# Bar chart of the most frequent words in one sentiment frequency table.
#
# Args:
#   freq:  data frame with columns `word` and `Freq`, sorted ascending by Freq.
#   title: plot title.
#   n:     number of top words to show (default 10).
# Returns:
#   A ggplot object.
top_words_plot <- function(freq, title, n = 10) {
  ggplot(data = tail(freq, n), aes(word, Freq, fill = word)) +
    geom_bar(position = "dodge", stat = "identity") +
    geom_text(aes(label = Freq), vjust = 1.6, color = "white", size = 3) +
    ggtitle(title) +
    # "none" hides the fill legend; passing FALSE here is deprecated in ggplot2.
    guides(fill = "none") +
    tilt_theme
}

p9 <- top_words_plot(joy, "Joy")
p10 <- top_words_plot(trust, "Trust")
p11 <- top_words_plot(fear, "Fear")
p12 <- top_words_plot(sadness, "Sadness")
p13 <- top_words_plot(anger, "Anger")
p14 <- top_words_plot(surprise, "Surprise")
p15 <- top_words_plot(disgust, "Disgust")
p16 <- top_words_plot(anticipation, "Anticipation")

# Arrange the eight panels in a 2 x 4 grid, filled row by row.
multiplot(p9, p10, p11, p12, p13, p14, p15, p16, layout = matrix(c(1,2,3,4,5,6,7,8), nrow=2, byrow=TRUE))


```

Let us check the top words that depict "positive" and "negative" sentiments in the NRC lexicon.

```{r fig.width=5, fig.height=2, echo=FALSE}
# Positive and negative lyric words according to the NRC lexicon. Both joins
# name their key explicitly so the match does not depend on implicitly shared
# column names.
pos <- lyrics_words %>%
  inner_join(
    get_sentiments("nrc") %>% filter(sentiment == "positive"),
    by = "word"
  )
# Ascending frequency table, so tail() returns the most frequent words.
pos <- as.data.frame(sort(table(pos$word)))
colnames(pos) <- columns_sentiment

neg <- lyrics_words %>%
  inner_join(
    get_sentiments("nrc") %>% filter(sentiment == "negative"),
    by = "word"
  )
neg <- as.data.frame(sort(table(neg$word)))
colnames(neg) <- columns_sentiment

# Top 20 words per polarity. guides(fill = "none") hides the fill legend;
# passing FALSE there is deprecated in ggplot2.
p17 <- ggplot(data = tail(pos, 20), aes(word, Freq, fill = word)) +
      geom_bar(position = "dodge", stat = "identity") +
      geom_text(aes(label = Freq), vjust = 1.6, color = "black", size = 3, angle = 90) +
      ggtitle("Positive (NRC)") +
      guides(fill = "none") +
      tilt_theme
p18 <- ggplot(data = tail(neg, 20), aes(word, Freq, fill = word)) +
      geom_bar(position = "dodge", stat = "identity") +
      geom_text(aes(label = Freq), vjust = 1.6, color = "black", size = 3, angle = 90) +
      ggtitle("Negative (NRC)") +
      guides(fill = "none") +
      tilt_theme
multiplot(p17, p18, cols = 2)
```

Let us check the top words that depict "positive" and "negative" sentiments in the bing and loughran lexicons.

```{r fig.width=10, fig.height=5, echo=FALSE}
# Build the word-frequency table for one polarity of one lexicon.
#
# Args:
#   lexicon:  lexicon name understood by get_sentiments(), e.g. "bing".
#   polarity: sentiment label to keep, "positive" or "negative".
# Returns:
#   A data frame with columns named by `columns_sentiment` (word, Freq), sorted
#   by ascending frequency so tail() gives the most frequent words.
# Reads `lyrics_words` and `columns_sentiment` from the notebook environment.
lexicon_word_freq <- function(lexicon, polarity) {
  tagged <- get_sentiments(lexicon) %>%
    filter(sentiment == polarity)
  # Explicit key: the match must not depend on implicitly shared column names.
  matched <- inner_join(lyrics_words, tagged, by = "word")
  freq <- as.data.frame(sort(table(matched$word)))
  colnames(freq) <- columns_sentiment
  freq
}

# Bar chart of the 20 most frequent words in an ascending frequency table.
polarity_plot <- function(freq, title) {
  ggplot(data = tail(freq, 20), aes(word, Freq, fill = word)) +
    geom_bar(position = "dodge", stat = "identity") +
    geom_text(aes(label = Freq), vjust = 1.6, color = "black", size = 3, angle = 90) +
    ggtitle(title) +
    # "none" hides the fill legend; passing FALSE here is deprecated in ggplot2.
    guides(fill = "none") +
    tilt_theme
}

pos_b <- lexicon_word_freq("bing", "positive")
neg_b <- lexicon_word_freq("bing", "negative")
pos_l <- lexicon_word_freq("loughran", "positive")
neg_l <- lexicon_word_freq("loughran", "negative")

p19 <- polarity_plot(pos_b, "Positive (BING)")
p20 <- polarity_plot(neg_b, "Negative (BING)")
p21 <- polarity_plot(pos_l, "Positive (LOUGHRAN)")
p22 <- polarity_plot(neg_l, "Negative (LOUGHRAN)")
# 2 x 2 grid, filled row by row: Bing on top, Loughran below.
multiplot(p19, p20,p21, p22,layout = matrix(c(1,2,3,4), nrow=2, byrow=TRUE))

```

It can be seen that positive and negative words are different for all three lexicons. 

